# How soccer became a global sport: where did it start and what changed as more teams were starting to compete.
# Which countries have dominated the different eras of soccer since everything started.
# Cleaning, processing and first exploration
# As seen below, this data set consists of (supposedly) all games since the inaugural Scotland - England in 1872.
# For each game, we have the score, the tournament, the host city and country.
### Loading libraries
library(ggplot2) # Data visualization
## Warning: package 'ggplot2' was built under R version 4.2.2
library(readr) # CSV file I/O, e.g. the read_csv function
## Warning: package 'readr' was built under R version 4.2.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.2.2
# Reading input file.
# The CSV location defaults to the original author's path, but it can be
# overridden with the SOCCER_RESULTS_CSV environment variable so that the
# script can run on another machine without editing the code.
results_path <- Sys.getenv("SOCCER_RESULTS_CSV", unset = NA)
if (is.na(results_path) || !nzchar(results_path)) {
  results_path <- "C://Users//Nishtha//Documents//bhavuk//Semester 6//DV//J Comp//results.csv"
}
df <- read_csv(results_path)
## Rows: 44353 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): home_team, away_team, tournament, city, country
## dbl (2): home_score, away_score
## lgl (1): neutral
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
## # A tibble: 6 × 9
## date home_team away_team home_sc…¹ away_…² tourn…³ city country neutral
## <date> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <lgl>
## 1 1872-11-30 Scotland England 0 0 Friend… Glas… Scotla… FALSE
## 2 1873-03-08 England Scotland 4 2 Friend… Lond… England FALSE
## 3 1874-03-07 Scotland England 2 1 Friend… Glas… Scotla… FALSE
## 4 1875-03-06 England Scotland 2 2 Friend… Lond… England FALSE
## 5 1876-03-04 Scotland England 3 0 Friend… Glas… Scotla… FALSE
## 6 1876-03-25 Scotland Wales 4 0 Friend… Glas… Scotla… FALSE
## # … with abbreviated variable names ¹home_score, ²away_score, ³tournament
# Let's check if we have some NA values we should clean.
# Apparently not. Good news, let's continue.
# The count is done column by column with vapply(): apply() would first coerce
# the whole data frame to a matrix, and is.null() on a column is always FALSE.
vapply(df, function(v) sum(is.na(v)), integer(1))
## date home_team away_team home_score away_score tournament city
## 0 0 0 0 0 0 0
## country neutral
## 0 0
# Let's process a bit the data so that we can have a quicker access to some important feature such as the result or the names of the winning or losing team. The outcome of a game will be encoded as D for draw, H for the home team winning and A for the away team winning. We will also extract some date-related features such as the day of week or month.
# Encode the result of one or many games.
#
# Vectorised: `home_score` and `away_score` may be scalars or vectors, so the
# function can be used directly on whole columns as well as row by row.
#
# Args:
#   home_score: goals scored by the home team.
#   away_score: goals scored by the away team.
# Returns:
#   A character vector with "H" (home win), "A" (away win) or "D" (draw);
#   NA where either score is missing.
game_outcome <- function(home_score, away_score) {
  home_wins <- home_score > away_score
  away_wins <- home_score < away_score
  outcome <- rep("D", length(home_wins))
  # which() drops NA comparisons, so missing scores never pick a branch.
  outcome[which(home_wins)] <- "H"
  outcome[which(away_wins)] <- "A"
  outcome[is.na(home_wins)] <- NA_character_
  outcome
}
# Name of the winning team for one or many games.
#
# Vectorised: all arguments may be scalars or vectors (team names are recycled
# to the number of games).
#
# Args:
#   home_score, away_score: goals scored by the home / away team.
#   home_team, away_team: names of the home / away team.
# Returns:
#   A character vector holding the winner's name; NA for a draw or when a
#   score is missing.
winning_team <- function(home_score, away_score, home_team, away_team) {
  home_wins <- home_score > away_score
  away_wins <- home_score < away_score
  n_games <- length(home_wins)
  home_team <- rep_len(as.character(home_team), n_games)
  away_team <- rep_len(as.character(away_team), n_games)
  winner <- rep(NA_character_, n_games)
  # which() drops NA comparisons, so games with a missing score stay NA.
  home_idx <- which(home_wins)
  away_idx <- which(away_wins)
  winner[home_idx] <- home_team[home_idx]
  winner[away_idx] <- away_team[away_idx]
  winner
}
# Name of the losing team for one or many games.
#
# Vectorised: all arguments may be scalars or vectors (team names are recycled
# to the number of games).
#
# Args:
#   home_score, away_score: goals scored by the home / away team.
#   home_team, away_team: names of the home / away team.
# Returns:
#   A character vector holding the loser's name; NA for a draw or when a
#   score is missing.
losing_team <- function(home_score, away_score, home_team, away_team) {
  home_loses <- home_score < away_score
  away_loses <- home_score > away_score
  n_games <- length(home_loses)
  home_team <- rep_len(as.character(home_team), n_games)
  away_team <- rep_len(as.character(away_team), n_games)
  loser <- rep(NA_character_, n_games)
  # which() drops NA comparisons, so games with a missing score stay NA.
  home_idx <- which(home_loses)
  away_idx <- which(away_loses)
  loser[home_idx] <- home_team[home_idx]
  loser[away_idx] <- away_team[away_idx]
  loser
}
# Derive the per-game features used by the rest of the analysis:
#   year, month, dayofweek             - calendar features taken from `date`
#                                        (year is kept as a character string);
#   outcome, winning_team, losing_team - result of the game.
# Month and weekday names come from fixed English lookup tables instead of
# format(date, "%b") / weekdays(date): those return names in the session
# locale, whereas the factor levels used further down are spelled in English.
weekday_names <- c(
  "Sunday", "Monday", "Tuesday", "Wednesday",
  "Thursday", "Friday", "Saturday"
)
df <- df %>%
  mutate(
    year = format(date, "%Y"),
    month = month.abb[as.integer(format(date, "%m"))],
    # POSIXlt's wday runs from 0 (Sunday) to 6 (Saturday).
    dayofweek = weekday_names[as.POSIXlt(date)$wday + 1]
  ) %>%
  # The result helpers are applied one game at a time.
  rowwise() %>%
  mutate(
    outcome = game_outcome(home_score, away_score),
    winning_team = winning_team(home_score, away_score, home_team, away_team),
    losing_team = losing_team(home_score, away_score, home_team, away_team)
  ) %>%
  ungroup()
head(df)
## # A tibble: 6 × 15
## date home_…¹ away_…² home_…³ away_…⁴ tourn…⁵ city country neutral year
## <date> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <lgl> <chr>
## 1 1872-11-30 Scotla… England 0 0 Friend… Glas… Scotla… FALSE 1872
## 2 1873-03-08 England Scotla… 4 2 Friend… Lond… England FALSE 1873
## 3 1874-03-07 Scotla… England 2 1 Friend… Glas… Scotla… FALSE 1874
## 4 1875-03-06 England Scotla… 2 2 Friend… Lond… England FALSE 1875
## 5 1876-03-04 Scotla… England 3 0 Friend… Glas… Scotla… FALSE 1876
## 6 1876-03-25 Scotla… Wales 4 0 Friend… Glas… Scotla… FALSE 1876
## # … with 5 more variables: month <chr>, dayofweek <chr>, outcome <chr>,
## # winning_team <chr>, losing_team <chr>, and abbreviated variable names
## # ¹home_team, ²away_team, ³home_score, ⁴away_score, ⁵tournament
# Now, let's do some basic exploration. How many entries? Answer: more than 44k matches (44,353 rows, 15 columns).
dim(df)
## [1] 44353 15
# A journey through the historical landscape of international soccer
# Which teams play the most?
# Let's start by checking which are the most represented teams? This will tell us which are the team with the richest history.
# Surprisingly, Sweden is the team that has played the most games. Most top 10 countries are major soccer nations such as Brazil, Argentina, England, Germany or France. Countries such as Uruguay, Mexico and Hungary are also old teams, as they participated in the first World Cups (1930 and/or 1934).
# Long table with one row per (team, game): every game contributes one row for
# its home team and one for its away team, each tagged with the game's year.
all_teams <- data.frame(
  teams = c(df$home_team, df$away_team),
  year = as.numeric(rep(df$year, times = 2))
)
# Number of games per team, most active teams first.
all_teams_count <- arrange(
  summarise(group_by(all_teams, teams), number_games = n()),
  desc(number_games)
)
head(all_teams_count, n = 10)
## # A tibble: 10 × 2
## teams number_games
## <chr> <int>
## 1 Sweden 1053
## 2 England 1049
## 3 Brazil 1021
## 4 Argentina 1018
## 5 Germany 986
## 6 Hungary 966
## 7 Mexico 935
## 8 Uruguay 919
## 9 South Korea 905
## 10 France 880
# It is likely that all these teams have different trajectories: some might have started playing earlier and some later. The plot below displays the cumulative sum of the number of matches for these top 10 teams. Hover over a line to display the name of the team. You can also click on a team's name to hide/show it.
# Number of games per year for the 10 teams with the most games overall.
# Years from 2018 onwards are left out. The result stays grouped by `teams`
# (stated explicitly through `.groups`), with `year_date` set to January 1st of
# each year so that the x axis can be drawn as a date.
top_teams_games_per_year <- all_teams %>%
  filter(teams %in% head(all_teams_count, 10)$teams & year < 2018) %>%
  group_by(teams, year) %>%
  summarise(nb_games = n(), .groups = "drop_last") %>%
  mutate(year_date = as.Date(paste0(year, "-01-01")))
## `summarise()` has grouped output by 'teams'. You can override using the
## `.groups` argument.
library(plotly)
# Running total of games for each team, accumulated in chronological order.
top_teams_games_per_year <- mutate(
  group_by(arrange(top_teams_games_per_year, teams, year), teams),
  cumsum = cumsum(nb_games)
)
# One line per team; converted to plotly to get hover labels and a clickable
# legend.
p <- ggplot(
  top_teams_games_per_year,
  aes(x = year_date, y = cumsum, colour = teams, group = teams)
) +
  geom_line() +
  labs(
    x = "Year",
    y = "Cumulated number of games",
    title = "Top 10 teams in total number of games",
    colour = "Click on a team \nto hide/show it"
  )
ggplotly(p)
# The 10 most active teams indeed have different trajectories. England gets its second position thanks to the many games it played in the 19th century. Some countries such as Sweden, France or Hungary have a steadier progression, while teams like Korea or Mexico join the top 10 thanks to their recent hyperactivity (Korea's first official games were just before 1950).
# How many games per year?
# Let's now check how many games were played each year and how the total number of international games evolve with time.
# Number of international games per year (years from 2018 onwards left out).
# `year` is stored as a character string in `df`, so it is converted to a
# number before the comparison; otherwise 2018 would be turned into the string
# "2018" and the years compared alphabetically.
tmp <- df %>%
  mutate(year = as.numeric(year)) %>%
  filter(year < 2018) %>%
  group_by(year) %>%
  summarise(nb_games = n()) %>%
  ungroup()
ggplot(tmp, aes(x = year, y = nb_games, group = 1)) +
  geom_line() +
  labs(x = "Year", title = "Number of international soccer games", y = "") +
  scale_x_continuous(breaks = seq(1870, 2020, 10))

# There are a few interesting things going on here:
# * Number of games is rising, with high growth in the 80s/90s.
# * It seems there is a peak around 2010, with a slight decrease since.
# * We see a drop during world wars.
# * Since the 80s, data is very spiky, likely due to the absence/presence of world cups or other events.
#
# Let's try to visualise this to add some understanding to our plot.
# Flag the World Cup years (1930, 1934, 1938, then every 4 years from 1950 to
# 2014) and overlay them as points on the yearly number of games. The vertical
# lines mark 1914, 1918, 1939 and 1945.
wc_years <- c(1930, 1934, 1938, seq(1950, 2014, 4))
tmp <- tmp %>%
  mutate(is_wc = year %in% wc_years)
ggplot(tmp, aes(x = year, y = nb_games, group = 1)) +
  geom_line() +
  geom_point(data = tmp %>% filter(is_wc), aes(colour = is_wc)) +
  labs(
    x = "Year",
    title = "Number of international soccer games",
    y = "",
    colour = "World cup year"
  ) +
  # `linewidth` replaces `lwd`/`size` for lines, deprecated in ggplot2 3.4.0.
  geom_vline(
    xintercept = c(1914, 1918, 1939, 1945),
    linewidth = 0.3,
    colour = "gray80"
  ) +
  scale_x_continuous(breaks = seq(1870, 2020, 10))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.

# The two main drops indeed correspond to the 2 world wars but, surprisingly, the world cup years are those counting less matches.
# Let's investigate which are the most common game types and competitions every year, since 2000.
# Number of games per tournament and per year (one row per pair). The result
# keeps the `tournament` grouping left over by summarise().
df_competitions <- summarise(
  group_by(df, tournament, year),
  nb_games = n()
)
## `summarise()` has grouped output by 'tournament'. You can override using the
## `.groups` argument.
# Stacked bars of the number of games per tournament for the years 2000-2017.
# `year` is a character column (kept as such so that the x axis is discrete);
# it is converted to a number for the range filter only, to avoid comparing
# strings with numbers.
ggplot(
  df_competitions %>%
    filter(as.numeric(year) >= 2000 & as.numeric(year) < 2018),
  aes(x = year, y = nb_games, fill = tournament)
) +
  geom_bar(stat = "identity") +
  # Too many tournaments for a readable legend; "none" replaces the deprecated
  # `FALSE` (ggplot2 3.3.4).
  guides(fill = "none") +
  labs(x = "Year", y = "Number of games")
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.

# We can see that some events/tournaments are more frequent on non-world cup years such as 2007 or 2011. Let's check what they are.
# Tournaments of 2011, ordered from most to fewest games.
arrange(filter(df_competitions, year == 2011), desc(nb_games))
## # A tibble: 22 × 3
## # Groups: tournament [22]
## tournament year nb_games
## <chr> <chr> <int>
## 1 Friendly 2011 379
## 2 FIFA World Cup qualification 2011 216
## 3 UEFA Euro qualification 2011 154
## 4 African Cup of Nations qualification 2011 77
## 5 AFC Asian Cup 2011 32
## 6 AFC Challenge Cup qualification 2011 29
## 7 Island Games 2011 29
## 8 Pacific Games 2011 29
## 9 CECAFA Cup 2011 26
## 10 Copa América 2011 26
## # … with 12 more rows
# Tournaments of 2010, ordered from most to fewest games.
arrange(filter(df_competitions, year == 2010), desc(nb_games))
## # A tibble: 21 × 3
## # Groups: tournament [21]
## tournament year nb_games
## <chr> <chr> <int>
## 1 Friendly 2010 423
## 2 UEFA Euro qualification 2010 94
## 3 FIFA World Cup 2010 64
## 4 African Cup of Nations qualification 2010 48
## 5 CFU Caribbean Cup qualification 2010 34
## 6 African Cup of Nations 2010 29
## 7 AFF Championship 2010 24
## 8 AFC Asian Cup qualification 2010 19
## 9 CECAFA Cup 2010 18
## 10 CFU Caribbean Cup 2010 16
## # … with 11 more rows
# World Cup qualifications generate many more matches than the World Cup itself, which makes sense as the World Cup only concerns 32 countries. This is well shown in the two plots below: there are no WC qualification matches during a World Cup year, and the number of qualification matches is greater than the number of WC matches by a factor of 3 to 7 in general.
# Yearly number of games (2006-2017) for a handful of major competitions.
# `year` is a character column (kept as such for the discrete x axis); it is
# converted to a number for the range filter only, to avoid comparing strings
# with numbers.
df_competition_filtered <- df_competitions %>%
  filter(
    as.numeric(year) >= 2006 & as.numeric(year) < 2018 &
      tournament %in% c(
        "Friendly",
        "UEFA Euro qualification",
        "FIFA World Cup",
        "FIFA World Cup qualification",
        "African Cup of Nations qualification"
      )
  )
ggplot(
  df_competition_filtered,
  aes(x = year, y = nb_games, group = tournament, colour = tournament)
) +
  geom_point() +
  geom_line() +
  labs(x = "Year", y = "Nb games", colour = "Competition")

# %% [code]
# Same data as stacked bars: one bar per year, split by competition.
ggplot(
  data = df_competition_filtered,
  mapping = aes(x = year, y = nb_games, group = tournament, fill = tournament)
) +
  geom_col() +
  labs(x = "Year", y = "Nb games", fill = "Competition")

# Worldwide soccer adoption
# When did soccer start to be widely played, i.e. when do most nations start playing international games? The plot below teaches us several things:
#
# * The number of teams steadily increased 1902 and this increase accelerated up to 1920.
# * From there, new teams are added at a much faster pace, which stalls a bit around the late 40's.
# * Then we see a steady and rapid growth up to the mid 1990's.
# Year of the first recorded game of every team.
df_teams_start <- summarise(
  group_by(mutate(all_teams, year = as.numeric(year)), teams),
  first_game = min(year)
)
# Number of teams playing their first game in each year, plus the running
# total of teams seen so far.
df_year_teams_start <- df_teams_start %>%
  group_by(first_game) %>%
  summarise(n = n()) %>%
  arrange(first_game) %>%
  mutate(cumsum = cumsum(n))
ggplot(data = df_year_teams_start, mapping = aes(x = first_game, y = cumsum)) +
  geom_line() +
  scale_x_continuous(breaks = seq(1870, 2020, 10)) +
  labs(
    x = "Year",
    title = "Cumulative sum of number of international soccer teams",
    y = ""
  )

# Which were the first and last teams to join?
# The first four teams to compete in international games were from what now forms the UK. Soccer then crossed the pond and teams such as Canada, USA, Argentina or Uruguay joined the party. At the same time, central European countries such as Austria and Hungary also joined the international arena.
# Amongst the late joiners we mostly find tiny countries (Vatican or Comoros) and recent ones (Kosovo or South Sudan). We also find Caribbean or North American islands which aren't countries but collectivities or municipalities of countries such as France or the Netherlands. Although they are not nations, they competed against other countries either in friendly games or in local tournaments.
# The 10 teams with the earliest first game.
head(arrange(df_teams_start, first_game), n = 10)
## # A tibble: 10 × 2
## teams first_game
## <chr> <dbl>
## 1 England 1872
## 2 Scotland 1872
## 3 Wales 1876
## 4 Northern Ireland 1882
## 5 Canada 1885
## 6 United States 1885
## 7 Argentina 1902
## 8 Austria 1902
## 9 Hungary 1902
## 10 Uruguay 1902
# The 10 teams with the latest first game.
tail(arrange(df_teams_start, first_game), n = 10)
## # A tibble: 10 × 2
## teams first_game
## <chr> <dbl>
## 1 Surrey 2018
## 2 Yorkshire 2018
## 3 Chameria 2019
## 4 Saint Helena 2019
## 5 Aymara 2022
## 6 Biafra 2022
## 7 Brunei Darussalam 2022
## 8 Mapuche 2022
## 9 Maule Sur 2022
## 10 Yoruba Nation 2022
# We have seen how different teams and continent started to compete one after the others. Let's now see what did this imply for the game itself and its organisation.
# When do games occur?
# Interestingly, the very first games mostly occurred on Saturdays, but a decent number also took place on Mondays! No game occurred on a Sunday until 1900, potentially for religious reasons, but around the 1910s Sunday was the most common day of the week to see an international game. Other weekdays, from Tuesday to Friday, weren't an option until later (as late as 1910 for Fridays).
#
# The proportion of games happening on a given day then changed quite a lot. Wednesday games became very common, and around 30% of the games happened on this day around the year 2000. More recently, days such as Tuesday, Thursday or Friday also became more popular.
# Share (in %) of each year's games played on each day of the week, for years
# before 2018. `dayofweek` becomes a factor ordered Monday to Sunday so that
# the facets appear in calendar order. The result stays grouped by `year`.
df_games_per_dayofweek <- df %>%
  mutate(year = as.numeric(year)) %>%
  filter(year < 2018) %>%
  group_by(year, dayofweek) %>%
  summarise(n = n()) %>%
  group_by(year) %>%
  mutate(
    perc = n / sum(n) * 100,
    dayofweek = factor(
      dayofweek,
      levels = c(
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday"
      )
    )
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# One facet per day of the week showing the yearly share of games played on
# that day.
ggplot(
  df_games_per_dayofweek,
  aes(x = year, y = perc, colour = dayofweek, group = dayofweek)
) +
  geom_line() +
  facet_wrap(~dayofweek) +
  labs(x = "Year", y = "Percentage of games played") +
  # The facet titles already name the day; "none" replaces the deprecated
  # `FALSE` (ggplot2 3.3.4).
  guides(colour = "none") +
  scale_x_continuous(breaks = seq(1870, 2020, 20)) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Now that we have looked at days, let's check whether some months are more popular for soccer games. The first games mostly occurred during spring months and, since then, some months have had peaks of popularity for international games at different periods (e.g. many games happened in December in the 1940s).
# In a more recent history, international games became less common in May but more in June.
# Share (in %) of each year's games played in each month, for years before
# 2018. `month` becomes a factor ordered January to December so that the facets
# appear in calendar order. The result stays grouped by `year`.
df_games_per_month <- df %>%
  mutate(year = as.numeric(year)) %>%
  filter(year < 2018) %>%
  group_by(year, month) %>%
  summarise(n = n()) %>%
  group_by(year) %>%
  mutate(
    perc = n / sum(n) * 100,
    month = factor(
      month,
      levels = c(
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
      )
    )
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# One facet per month showing the yearly share of games played in that month.
ggplot(
  df_games_per_month,
  aes(x = year, y = perc, colour = month, group = month)
) +
  geom_line() +
  facet_wrap(~month) +
  labs(x = "Year", y = "Percentage of games played") +
  # The facet titles already name the month; "none" replaces the deprecated
  # `FALSE` (ggplot2 3.3.4).
  guides(colour = "none") +
  scale_x_continuous(breaks = seq(1870, 2020, 20)) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Evolution of results
# Let's now talk about sport and actual results! First let's check how the proportion of draws and home/away victories evolves through time. Main learnings are:
# * A victory of the home-based team has always been the most likely event.
# * A victory of the visitors is the second most likely outcome, although it tends to decrease in the second half of the 20th century.
# * A draw has always been the least likely outcome, although it has increased in share since the 1940's.
# It is to be noted that the "home" team isn't always playing in its own country, for example during world or continental cups.
# Per year: number of games for each outcome (H / A / D), the total number of
# games that year, and the share (in %) of each outcome. The result stays
# grouped by `year`.
df_outcome_per_year <- df %>%
  mutate(year = as.numeric(year)) %>%
  group_by(year, outcome) %>%
  summarise(n = n()) %>%
  group_by(year) %>%
  mutate(
    total_year = sum(n),
    perc = n / total_year * 100
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Yearly share of each outcome between 1901 and 2017, with a loess trend line
# per outcome drawn on top of the raw yearly values.
ggplot(
  data = filter(df_outcome_per_year, year > 1900, year < 2018),
  mapping = aes(x = year, y = perc, group = outcome, colour = outcome)
) +
  geom_line() +
  labs(x = "Year", y = "Percentage of games", colour = "Outcome") +
  geom_smooth(se = FALSE, method = "loess") +
  scale_x_continuous(breaks = seq(1870, 2020, 20))
## `geom_smooth()` using formula = 'y ~ x'

# Let's now get to what is at the heart of soccer: goals! How did this evolve with time?
# Although it started low (the first game resulted in a 0-0 between Scotland and England), the number of goals per game quickly skyrocketed and, before 1900, the average number of goals per game per year could be as high as 8!
# This average then stabilized around 4 until 1950 and then decreased down to 2.5 in the more modern era. The 80's has been the period where games delivered the lowest number of goals.
# Per year: number of games, total number of goals (home + away) and the
# resulting average number of goals per game.
df_goals_per_game <- summarise(
  group_by(mutate(df, year = as.numeric(year)), year),
  nb_games = n(),
  nb_goals = sum(home_score + away_score),
  goals_per_game = nb_goals / nb_games
)
ggplot(data = df_goals_per_game, mapping = aes(x = year, y = goals_per_game)) +
  geom_line() +
  labs(x = "Year", y = "", title = "Average number of goals per game") +
  scale_x_continuous(breaks = seq(1870, 2020, 10))
